webcontext-ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +583 -0
- package/dist/browser/manager.d.ts +47 -0
- package/dist/browser/manager.d.ts.map +1 -0
- package/dist/browser/manager.js +215 -0
- package/dist/browser/manager.js.map +1 -0
- package/dist/cache/cache.d.ts +22 -0
- package/dist/cache/cache.d.ts.map +1 -0
- package/dist/cache/cache.js +150 -0
- package/dist/cache/cache.js.map +1 -0
- package/dist/chunking/chunker.d.ts +26 -0
- package/dist/chunking/chunker.d.ts.map +1 -0
- package/dist/chunking/chunker.js +208 -0
- package/dist/chunking/chunker.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +406 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/core/pipeline.d.ts +35 -0
- package/dist/core/pipeline.d.ts.map +1 -0
- package/dist/core/pipeline.js +476 -0
- package/dist/core/pipeline.js.map +1 -0
- package/dist/core/stream.d.ts +48 -0
- package/dist/core/stream.d.ts.map +1 -0
- package/dist/core/stream.js +72 -0
- package/dist/core/stream.js.map +1 -0
- package/dist/core/types.d.ts +259 -0
- package/dist/core/types.d.ts.map +1 -0
- package/dist/core/types.js +4 -0
- package/dist/core/types.js.map +1 -0
- package/dist/export/index.d.ts +3 -0
- package/dist/export/index.d.ts.map +1 -0
- package/dist/export/index.js +8 -0
- package/dist/export/index.js.map +1 -0
- package/dist/export/templates.d.ts +25 -0
- package/dist/export/templates.d.ts.map +1 -0
- package/dist/export/templates.js +76 -0
- package/dist/export/templates.js.map +1 -0
- package/dist/export/vectordb.d.ts +21 -0
- package/dist/export/vectordb.d.ts.map +1 -0
- package/dist/export/vectordb.js +101 -0
- package/dist/export/vectordb.js.map +1 -0
- package/dist/extractors/content.d.ts +23 -0
- package/dist/extractors/content.d.ts.map +1 -0
- package/dist/extractors/content.js +328 -0
- package/dist/extractors/content.js.map +1 -0
- package/dist/extractors/github.d.ts +19 -0
- package/dist/extractors/github.d.ts.map +1 -0
- package/dist/extractors/github.js +150 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/images.d.ts +20 -0
- package/dist/extractors/images.d.ts.map +1 -0
- package/dist/extractors/images.js +73 -0
- package/dist/extractors/images.js.map +1 -0
- package/dist/extractors/pdf.d.ts +11 -0
- package/dist/extractors/pdf.d.ts.map +1 -0
- package/dist/extractors/pdf.js +107 -0
- package/dist/extractors/pdf.js.map +1 -0
- package/dist/extractors/screenshot.d.ts +21 -0
- package/dist/extractors/screenshot.d.ts.map +1 -0
- package/dist/extractors/screenshot.js +85 -0
- package/dist/extractors/screenshot.js.map +1 -0
- package/dist/index.d.ts +70 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +206 -0
- package/dist/index.js.map +1 -0
- package/dist/mcp-server.d.ts +3 -0
- package/dist/mcp-server.d.ts.map +1 -0
- package/dist/mcp-server.js +108 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/sdk/client.d.ts +48 -0
- package/dist/sdk/client.d.ts.map +1 -0
- package/dist/sdk/client.js +120 -0
- package/dist/sdk/client.js.map +1 -0
- package/dist/sdk/mcp.d.ts +12 -0
- package/dist/sdk/mcp.d.ts.map +1 -0
- package/dist/sdk/mcp.js +146 -0
- package/dist/sdk/mcp.js.map +1 -0
- package/dist/sdk/server.d.ts +5 -0
- package/dist/sdk/server.d.ts.map +1 -0
- package/dist/sdk/server.js +158 -0
- package/dist/sdk/server.js.map +1 -0
- package/dist/search/vector.d.ts +26 -0
- package/dist/search/vector.d.ts.map +1 -0
- package/dist/search/vector.js +142 -0
- package/dist/search/vector.js.map +1 -0
- package/dist/transformers/markdown.d.ts +21 -0
- package/dist/transformers/markdown.d.ts.map +1 -0
- package/dist/transformers/markdown.js +242 -0
- package/dist/transformers/markdown.js.map +1 -0
- package/dist/utils/dedup.d.ts +20 -0
- package/dist/utils/dedup.d.ts.map +1 -0
- package/dist/utils/dedup.js +61 -0
- package/dist/utils/dedup.js.map +1 -0
- package/dist/utils/index.d.ts +6 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +15 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/metrics.d.ts +16 -0
- package/dist/utils/metrics.d.ts.map +1 -0
- package/dist/utils/metrics.js +28 -0
- package/dist/utils/metrics.js.map +1 -0
- package/dist/utils/scheduler.d.ts +19 -0
- package/dist/utils/scheduler.d.ts.map +1 -0
- package/dist/utils/scheduler.js +63 -0
- package/dist/utils/scheduler.js.map +1 -0
- package/dist/utils/sitemap.d.ts +17 -0
- package/dist/utils/sitemap.d.ts.map +1 -0
- package/dist/utils/sitemap.js +118 -0
- package/dist/utils/sitemap.js.map +1 -0
- package/dist/utils/validation.d.ts +142 -0
- package/dist/utils/validation.d.ts.map +1 -0
- package/dist/utils/validation.js +35 -0
- package/dist/utils/validation.js.map +1 -0
- package/dist/utils/webhook.d.ts +21 -0
- package/dist/utils/webhook.d.ts.map +1 -0
- package/dist/utils/webhook.js +108 -0
- package/dist/utils/webhook.js.map +1 -0
- package/package.json +109 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 sumeethmoolya
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,583 @@
|
|
|
1
|
+
# WebContext AI
|
|
2
|
+
|
|
3
|
+
> Turn any web content into clean AI-ready context — with crawling, chunking, semantic search, vector DB export, and MCP tools.
|
|
4
|
+
|
|
5
|
+
WebContext is a developer tool that crawls, extracts, cleans, and structures web content for consumption by LLMs, RAG pipelines, and AI agents. Think of it as Firecrawl — but open-source, self-hosted, and optimized for developer documentation.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Smart Extraction** — Removes ads, navigation, cookie banners, and noise automatically
|
|
10
|
+
- **Code Preservation** — Keeps code blocks intact with language detection (15+ languages)
|
|
11
|
+
- **Recursive Crawling** — Crawl entire documentation sites with depth control and sitemap support
|
|
12
|
+
- **Token-Aware Chunking** — Semantic, heading-based, paragraph, or fixed-size chunking using tiktoken
|
|
13
|
+
- **Semantic Search** — TF-IDF vector search over extracted content chunks
|
|
14
|
+
- **Vector DB Export** — Export chunks ready for Pinecone, Chroma, Weaviate, Qdrant
|
|
15
|
+
- **PDF Extraction** — Extract text from PDF files and URLs
|
|
16
|
+
- **GitHub Extraction** — Fetch README and /docs from any public GitHub repository
|
|
17
|
+
- **Screenshot Capture** — Take full-page screenshots of web pages
|
|
18
|
+
- **Image Extraction** — Extract images with alt text and surrounding context
|
|
19
|
+
- **Streaming** — Real-time event-based output as pages are crawled
|
|
20
|
+
- **Output Templates** — Built-in templates (default, LLM, XML tags, summary, minimal) or define your own
|
|
21
|
+
- **MCP Server** — Model Context Protocol tools for AI agents (Cursor, Claude, Amazon Q)
|
|
22
|
+
- **Browser Rendering** — Optional Playwright-powered JS rendering for SPAs
|
|
23
|
+
- **Rate Limiting** — Token bucket rate limiter with configurable requests/second
|
|
24
|
+
- **Retry with Backoff** — Exponential backoff on 429/5xx responses
|
|
25
|
+
- **robots.txt Compliance** — Respects robots.txt by default
|
|
26
|
+
- **Caching** — Dual-layer (LRU memory + file-based) with TTL and content diff detection
|
|
27
|
+
- **Content Diffing** — Detect what changed between crawls via content hashing
|
|
28
|
+
- **Deduplication** — Automatically skips duplicate content during crawls
|
|
29
|
+
- **Sitemap Auto-Discovery** — Finds and uses sitemaps automatically before crawling
|
|
30
|
+
- **Link Resolution** — Converts relative links to absolute URLs in output
|
|
31
|
+
- **Focus Modes** — Extract only articles, code, API references, or READMEs
|
|
32
|
+
- **Plugin System** — Hook into any phase of the pipeline (pre/post fetch, extract, transform, chunk)
|
|
33
|
+
- **Checkpoint/Resume** — Save crawl state to disk and resume interrupted crawls
|
|
34
|
+
- **Scheduling** — Cron-based recurring crawls for keeping context fresh
|
|
35
|
+
- **Webhooks** — Get notified when crawls complete or content changes
|
|
36
|
+
- **LangChain Compatible** — Document loader adapter included
|
|
37
|
+
- **Metrics** — Track crawl performance, cache hit rates, token usage
|
|
38
|
+
- **Input Validation** — Zod-based validation on all inputs
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
npm install webcontext-ai
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
> **Note:** WebContext works out of the box for most sites (server-rendered). For JavaScript-heavy SPAs, you also need Playwright:
|
|
47
|
+
> ```bash
|
|
48
|
+
> npm install playwright
|
|
49
|
+
> npx playwright install chromium
|
|
50
|
+
> ```
|
|
51
|
+
> Then pass `{ javascript: true }` to enable browser rendering.
|
|
52
|
+
|
|
53
|
+
> **Optional extras:**
|
|
54
|
+
> ```bash
|
|
55
|
+
> npm install pdf-parse # For PDF extraction
|
|
56
|
+
> npm install playwright # For screenshots & JS rendering
|
|
57
|
+
> ```
|
|
58
|
+
|
|
59
|
+
## CLI Usage
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Extract a single page as markdown
|
|
63
|
+
webcontext extract https://docs.example.com/api --format markdown
|
|
64
|
+
|
|
65
|
+
# Crawl documentation recursively
|
|
66
|
+
webcontext crawl https://docs.example.com --depth 3 --max-pages 100 -o docs.md
|
|
67
|
+
|
|
68
|
+
# Generate LLM-ready context with token budget
|
|
69
|
+
webcontext context https://docs.example.com/quickstart --budget 4000
|
|
70
|
+
|
|
71
|
+
# Semantic search within a page
|
|
72
|
+
webcontext search https://docs.example.com/api "authentication"
|
|
73
|
+
|
|
74
|
+
# Export for vector database
|
|
75
|
+
webcontext export https://docs.example.com --to pinecone -o chunks.json
|
|
76
|
+
webcontext export https://docs.example.com --to chroma --namespace my-docs
|
|
77
|
+
|
|
78
|
+
# Extract GitHub repository
|
|
79
|
+
webcontext github https://github.com/user/repo -o repo-docs.md
|
|
80
|
+
|
|
81
|
+
# Extract PDF
|
|
82
|
+
webcontext pdf https://example.com/paper.pdf -o paper.md
|
|
83
|
+
webcontext pdf ./local-file.pdf -o extracted.md
|
|
84
|
+
|
|
85
|
+
# Take screenshot
|
|
86
|
+
webcontext screenshot https://docs.example.com -o ./screenshots --full-page
|
|
87
|
+
|
|
88
|
+
# Validate a URL
|
|
89
|
+
webcontext validate https://docs.example.com
|
|
90
|
+
|
|
91
|
+
# Schedule recurring crawls
|
|
92
|
+
webcontext schedule https://docs.example.com --cron "0 */6 * * *" -o ./docs-cache
|
|
93
|
+
|
|
94
|
+
# Start API server
|
|
95
|
+
webcontext serve --port 3456
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## SDK Usage
|
|
99
|
+
|
|
100
|
+
```typescript
|
|
101
|
+
import { WebContext } from 'webcontext-ai';
|
|
102
|
+
|
|
103
|
+
const wc = new WebContext({
|
|
104
|
+
cache: { enabled: true, ttl: 3600, maxSize: 500, contentHashing: true },
|
|
105
|
+
chunking: { maxTokens: 1500, strategy: 'semantic', overlap: 100 },
|
|
106
|
+
concurrency: 5,
|
|
107
|
+
metrics: true,
|
|
108
|
+
});
|
|
109
|
+
|
|
110
|
+
// Extract single page
|
|
111
|
+
const result = await wc.extract('https://docs.example.com/api');
|
|
112
|
+
console.log(result.pages[0].markdown);
|
|
113
|
+
|
|
114
|
+
// Crawl documentation site
|
|
115
|
+
const docs = await wc.crawlDocs('https://docs.example.com', {
|
|
116
|
+
depth: 2,
|
|
117
|
+
maxPages: 50,
|
|
118
|
+
onProgress: (p) => console.log(`${p.pagesProcessed}/${p.totalDiscovered}`),
|
|
119
|
+
});
|
|
120
|
+
|
|
121
|
+
// Get RAG-ready chunks
|
|
122
|
+
const chunks = await wc.toChunks('https://docs.example.com/guide');
|
|
123
|
+
|
|
124
|
+
// Generate token-budgeted context for LLM
|
|
125
|
+
const context = await wc.toContext('https://docs.example.com', { maxTokens: 4000 });
|
|
126
|
+
|
|
127
|
+
// Semantic search
|
|
128
|
+
const results = await wc.search('https://docs.example.com/api', 'authentication', 5);
|
|
129
|
+
|
|
130
|
+
// Extract GitHub repo
|
|
131
|
+
const repo = await wc.extractGitHub('https://github.com/user/repo');
|
|
132
|
+
|
|
133
|
+
// Extract PDF
|
|
134
|
+
const pdf = await wc.extractPdf('https://example.com/paper.pdf');
|
|
135
|
+
|
|
136
|
+
// Export for vector DB
|
|
137
|
+
const pineconeData = await wc.exportForVectorDB('https://docs.example.com', {
|
|
138
|
+
format: 'pinecone',
|
|
139
|
+
namespace: 'my-docs',
|
|
140
|
+
});
|
|
141
|
+
|
|
142
|
+
// Stream results in real-time
|
|
143
|
+
const stream = wc.extractStream('https://docs.example.com');
|
|
144
|
+
stream.onPage((page) => console.log(`Extracted: ${page.title}`));
|
|
145
|
+
stream.onDone((result) => console.log(`Done! ${result.stats.totalTokens} tokens`));
|
|
146
|
+
|
|
147
|
+
// Webhooks
|
|
148
|
+
wc.registerWebhook({
|
|
149
|
+
url: 'https://your-server.com/webhook',
|
|
150
|
+
events: ['crawl.complete', 'content.changed'],
|
|
151
|
+
secret: 'your-secret',
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
// Cleanup
|
|
155
|
+
wc.dispose();
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Vector DB Export
|
|
159
|
+
|
|
160
|
+
Export chunks in formats ready for direct import into popular vector databases:
|
|
161
|
+
|
|
162
|
+
```typescript
|
|
163
|
+
import { WebContext } from 'webcontext-ai';
|
|
164
|
+
|
|
165
|
+
const wc = new WebContext();
|
|
166
|
+
const result = await wc.extract('https://docs.example.com');
|
|
167
|
+
|
|
168
|
+
// Export as Pinecone format
|
|
169
|
+
const pinecone = await wc.exportForVectorDB('https://docs.example.com', { format: 'pinecone', namespace: 'docs' });
|
|
170
|
+
|
|
171
|
+
// Export as Chroma format
|
|
172
|
+
const chroma = await wc.exportForVectorDB('https://docs.example.com', { format: 'chroma', collection: 'my-docs' });
|
|
173
|
+
|
|
174
|
+
// Supported formats: pinecone, chroma, weaviate, qdrant, json
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
CLI:
|
|
178
|
+
```bash
|
|
179
|
+
webcontext export https://docs.example.com --to pinecone -o pinecone-chunks.json
|
|
180
|
+
webcontext export https://docs.example.com --to chroma --namespace docs -o chroma-chunks.json
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Output Templates
|
|
184
|
+
|
|
185
|
+
Format extracted content using built-in or custom templates:
|
|
186
|
+
|
|
187
|
+
```typescript
|
|
188
|
+
import { OutputFormatter } from 'webcontext-ai';
|
|
189
|
+
|
|
190
|
+
const fmt = new OutputFormatter();
|
|
191
|
+
|
|
192
|
+
// Built-in templates: default, llm, xml-tags, summary, minimal
|
|
193
|
+
fmt.formatPage(page, 'llm');
|
|
194
|
+
// Output: <context source="https://..." tokens="1234">...content...</context>
|
|
195
|
+
|
|
196
|
+
fmt.formatPage(page, 'xml-tags');
|
|
197
|
+
// Output: <document><title>...</title><source>...</source><content>...</content></document>
|
|
198
|
+
|
|
199
|
+
// Register custom template
|
|
200
|
+
fmt.register({
|
|
201
|
+
name: 'my-format',
|
|
202
|
+
template: '---\ntitle: {{title}}\nsource: {{url}}\n---\n\n{{markdown}}',
|
|
203
|
+
});
|
|
204
|
+
fmt.formatPage(page, 'my-format');
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## MCP Tools (AI Agent Integration)
|
|
208
|
+
|
|
209
|
+
Use WebContext as a tool inside **Cursor**, **Claude Desktop**, **Amazon Q Developer**, or any MCP-compatible AI agent.
|
|
210
|
+
|
|
211
|
+
### Setup for Claude Desktop
|
|
212
|
+
|
|
213
|
+
Add to your `claude_desktop_config.json`:
|
|
214
|
+
|
|
215
|
+
```json
|
|
216
|
+
{
|
|
217
|
+
"mcpServers": {
|
|
218
|
+
"webcontext": {
|
|
219
|
+
"command": "npx",
|
|
220
|
+
"args": ["-y", "webcontext-ai", "webcontext-mcp"]
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### Setup for Cursor
|
|
227
|
+
|
|
228
|
+
Add to `.cursor/mcp.json` in your project:
|
|
229
|
+
|
|
230
|
+
```json
|
|
231
|
+
{
|
|
232
|
+
"mcpServers": {
|
|
233
|
+
"webcontext": {
|
|
234
|
+
"command": "npx",
|
|
235
|
+
"args": ["-y", "webcontext-ai", "webcontext-mcp"]
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### Setup for Amazon Q Developer / Kiro
|
|
242
|
+
|
|
243
|
+
Add to your MCP configuration:
|
|
244
|
+
|
|
245
|
+
```json
|
|
246
|
+
{
|
|
247
|
+
"mcpServers": {
|
|
248
|
+
"webcontext": {
|
|
249
|
+
"command": "npx",
|
|
250
|
+
"args": ["-y", "webcontext-ai", "webcontext-mcp"]
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Available MCP Tools
|
|
257
|
+
|
|
258
|
+
| Tool | Description | Example Prompt |
|
|
259
|
+
|------|-------------|----------------|
|
|
260
|
+
| `webcontext_extract` | Extract clean content from a URL | "Extract the React docs for useState" |
|
|
261
|
+
| `webcontext_crawl` | Crawl a documentation site | "Crawl the Express.js guide, 3 levels deep" |
|
|
262
|
+
| `webcontext_search` | Semantic search within a page | "Search the Next.js docs for 'server components'" |
|
|
263
|
+
| `webcontext_chunk` | Get RAG-ready chunks | "Chunk the TailwindCSS docs for my vector DB" |
|
|
264
|
+
| `webcontext_summarize` | Summarize a web page | "Summarize this API reference page" |
|
|
265
|
+
| `webcontext_github` | Extract GitHub repo docs | "Get the README from TanStack/query" |
|
|
266
|
+
| `webcontext_pdf` | Extract PDF content | "Extract text from this research paper PDF" |
|
|
267
|
+
|
|
268
|
+
## Streaming
|
|
269
|
+
|
|
270
|
+
Get results in real-time as pages are processed:
|
|
271
|
+
|
|
272
|
+
```typescript
|
|
273
|
+
const stream = wc.extractStream('https://docs.example.com');
|
|
274
|
+
|
|
275
|
+
stream.onPage((page) => {
|
|
276
|
+
console.log(`✓ ${page.title} (${page.codeBlocks.length} code blocks)`);
|
|
277
|
+
});
|
|
278
|
+
|
|
279
|
+
stream.onProgress((p) => {
|
|
280
|
+
console.log(`${p.pagesProcessed}/${p.totalDiscovered} - ${p.currentUrl}`);
|
|
281
|
+
});
|
|
282
|
+
|
|
283
|
+
stream.onDone((result) => {
|
|
284
|
+
console.log(`Complete: ${result.stats.totalTokens} tokens`);
|
|
285
|
+
});
|
|
286
|
+
|
|
287
|
+
// Or await completion
|
|
288
|
+
const result = await stream.toPromise();
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
## GitHub Extraction
|
|
292
|
+
|
|
293
|
+
Extract README and documentation from any public GitHub repository:
|
|
294
|
+
|
|
295
|
+
```typescript
|
|
296
|
+
// Just the README
|
|
297
|
+
const readme = await wc.extractGitHub('https://github.com/TanStack/query');
|
|
298
|
+
|
|
299
|
+
// README + /docs folder
|
|
300
|
+
const full = await wc.extractGitHub('https://github.com/TanStack/query', { depth: 1 });
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
CLI:
|
|
304
|
+
```bash
|
|
305
|
+
webcontext github https://github.com/expressjs/express -o express-docs.md
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
## PDF Extraction
|
|
309
|
+
|
|
310
|
+
Extract text from PDF files (requires `npm install pdf-parse`):
|
|
311
|
+
|
|
312
|
+
```typescript
|
|
313
|
+
// From URL
|
|
314
|
+
const paper = await wc.extractPdf('https://example.com/paper.pdf');
|
|
315
|
+
|
|
316
|
+
// From local file
|
|
317
|
+
const local = await wc.extractPdf('./documents/spec.pdf');
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
CLI:
|
|
321
|
+
```bash
|
|
322
|
+
webcontext pdf https://arxiv.org/pdf/1706.03762 -o transformer-paper.md
|
|
323
|
+
webcontext pdf ./local-file.pdf --format chunks -o chunks.json
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
## Webhooks
|
|
327
|
+
|
|
328
|
+
Get notified when crawls complete or content changes:
|
|
329
|
+
|
|
330
|
+
```typescript
|
|
331
|
+
wc.registerWebhook({
|
|
332
|
+
url: 'https://your-server.com/webhook',
|
|
333
|
+
secret: 'hmac-secret', // Signs payload with HMAC-SHA256
|
|
334
|
+
events: ['crawl.complete', 'crawl.error', 'content.changed'],
|
|
335
|
+
});
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
Webhook payload example:
|
|
339
|
+
```json
|
|
340
|
+
{
|
|
341
|
+
"event": "content.changed",
|
|
342
|
+
"timestamp": "2024-01-15T10:30:00Z",
|
|
343
|
+
"data": {
|
|
344
|
+
"changedPages": 3,
|
|
345
|
+
"diffs": [
|
|
346
|
+
{ "url": "https://docs.example.com/api", "addedSections": ["New Endpoint"], "removedSections": [] }
|
|
347
|
+
]
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
## Client SDK (Remote Server)
|
|
353
|
+
|
|
354
|
+
```typescript
|
|
355
|
+
import { WebContextClient } from 'webcontext-ai/sdk/client';
|
|
356
|
+
|
|
357
|
+
const client = new WebContextClient({ serverUrl: 'http://localhost:3456' });
|
|
358
|
+
const markdown = await client.toMarkdown('https://example.com');
|
|
359
|
+
const results = await client.search('https://example.com', 'pricing', 3);
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
## LangChain Integration
|
|
363
|
+
|
|
364
|
+
```typescript
|
|
365
|
+
import { WebContextLoader } from 'webcontext-ai/sdk/client';
|
|
366
|
+
|
|
367
|
+
const loader = new WebContextLoader();
|
|
368
|
+
const docs = await loader.load('https://docs.example.com/guide');
|
|
369
|
+
// Returns LangChain-compatible Document[] with pageContent + metadata
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
## Plugin System
|
|
373
|
+
|
|
374
|
+
```typescript
|
|
375
|
+
import { WebContext, WebContextPlugin } from 'webcontext-ai';
|
|
376
|
+
|
|
377
|
+
const myPlugin: WebContextPlugin = {
|
|
378
|
+
name: 'custom-cleaner',
|
|
379
|
+
hooks: {
|
|
380
|
+
'post-extract': async (ctx) => {
|
|
381
|
+
ctx.extracted.markdown = ctx.extracted.markdown.replace(/CONFIDENTIAL/g, '[REDACTED]');
|
|
382
|
+
return ctx;
|
|
383
|
+
},
|
|
384
|
+
'post-chunk': async (ctx) => {
|
|
385
|
+
ctx.chunks = ctx.chunks.filter(c => c.tokens > 50);
|
|
386
|
+
return ctx;
|
|
387
|
+
},
|
|
388
|
+
},
|
|
389
|
+
};
|
|
390
|
+
|
|
391
|
+
const wc = new WebContext({ plugins: [myPlugin] });
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
## API Server
|
|
395
|
+
|
|
396
|
+
```bash
|
|
397
|
+
webcontext serve --port 3456
|
|
398
|
+
```
|
|
399
|
+
|
|
400
|
+
| Method | Path | Description |
|
|
401
|
+
|--------|------|-------------|
|
|
402
|
+
| POST | `/extract` | Extract content from a single URL |
|
|
403
|
+
| POST | `/crawl` | Recursively crawl a documentation site |
|
|
404
|
+
| POST | `/context` | Generate LLM-ready context with token budget |
|
|
405
|
+
| POST | `/chunks` | Get RAG-ready content chunks |
|
|
406
|
+
| POST | `/search` | Semantic search within extracted content |
|
|
407
|
+
| GET | `/metrics` | View crawl metrics |
|
|
408
|
+
| POST | `/schedule` | Schedule recurring crawls |
|
|
409
|
+
| DELETE | `/schedule/:id` | Cancel a scheduled job |
|
|
410
|
+
| GET | `/health` | Health check |
|
|
411
|
+
|
|
412
|
+
## Configuration
|
|
413
|
+
|
|
414
|
+
```typescript
|
|
415
|
+
const wc = new WebContext({
|
|
416
|
+
browser: {
|
|
417
|
+
headless: true,
|
|
418
|
+
proxy: 'http://proxy:8080',
|
|
419
|
+
userAgent: 'MyBot/1.0',
|
|
420
|
+
viewport: { width: 1280, height: 720 },
|
|
421
|
+
},
|
|
422
|
+
extraction: {
|
|
423
|
+
removeSelectors: ['.sidebar', '.footer'],
|
|
424
|
+
contentSelectors: ['.doc-content'],
|
|
425
|
+
preserveImages: true,
|
|
426
|
+
preserveTables: true,
|
|
427
|
+
},
|
|
428
|
+
chunking: {
|
|
429
|
+
maxTokens: 1500,
|
|
430
|
+
overlap: 100,
|
|
431
|
+
strategy: 'semantic', // 'semantic' | 'heading' | 'fixed' | 'paragraph'
|
|
432
|
+
preserveCodeBlocks: true,
|
|
433
|
+
preserveHeadings: true,
|
|
434
|
+
},
|
|
435
|
+
cache: {
|
|
436
|
+
enabled: true,
|
|
437
|
+
ttl: 3600,
|
|
438
|
+
maxSize: 500,
|
|
439
|
+
directory: './.webcontext-cache',
|
|
440
|
+
contentHashing: true,
|
|
441
|
+
},
|
|
442
|
+
retry: {
|
|
443
|
+
maxRetries: 3,
|
|
444
|
+
backoffMs: 1000,
|
|
445
|
+
backoffMultiplier: 2,
|
|
446
|
+
retryOn: [429, 500, 502, 503, 504],
|
|
447
|
+
},
|
|
448
|
+
rateLimit: {
|
|
449
|
+
requestsPerSecond: 2,
|
|
450
|
+
burstSize: 5,
|
|
451
|
+
},
|
|
452
|
+
concurrency: 3,
|
|
453
|
+
metrics: true,
|
|
454
|
+
plugins: [],
|
|
455
|
+
});
|
|
456
|
+
```
|
|
457
|
+
|
|
458
|
+
## Real-World Examples
|
|
459
|
+
|
|
460
|
+
### Feed documentation into your AI chatbot (RAG)
|
|
461
|
+
|
|
462
|
+
```typescript
|
|
463
|
+
import { WebContext } from 'webcontext-ai';
|
|
464
|
+
|
|
465
|
+
const wc = new WebContext();
|
|
466
|
+
const result = await wc.crawlDocs('https://your-docs.com', { depth: 3, maxPages: 100 });
|
|
467
|
+
|
|
468
|
+
// Export directly for your vector DB
|
|
469
|
+
const pineconeData = await wc.exportForVectorDB('https://your-docs.com', {
|
|
470
|
+
format: 'pinecone',
|
|
471
|
+
namespace: 'product-docs',
|
|
472
|
+
});
|
|
473
|
+
// Write to file and import via Pinecone CLI/API
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
### Keep AI context fresh with scheduled re-crawls
|
|
477
|
+
|
|
478
|
+
```typescript
|
|
479
|
+
import { WebContext, CrawlScheduler } from 'webcontext-ai';
|
|
480
|
+
|
|
481
|
+
const wc = new WebContext();
|
|
482
|
+
const scheduler = new CrawlScheduler();
|
|
483
|
+
|
|
484
|
+
scheduler.schedule('docs-sync', {
|
|
485
|
+
cron: '0 */6 * * *',
|
|
486
|
+
urls: ['https://your-docs.com'],
|
|
487
|
+
options: { depth: 2 },
|
|
488
|
+
onComplete: (result) => {
|
|
489
|
+
if (result.diffs?.length) {
|
|
490
|
+
console.log(`${result.diffs.length} pages changed — re-indexing`);
|
|
491
|
+
}
|
|
492
|
+
},
|
|
493
|
+
}, (url, opts) => wc.crawlDocs(url, opts));
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
### Use in a Cursor/Claude workflow
|
|
497
|
+
|
|
498
|
+
Just ask your AI agent:
|
|
499
|
+
- *"Use webcontext to extract the Next.js App Router docs and explain how layouts work"*
|
|
500
|
+
- *"Crawl the Stripe API reference and summarize the payment intents section"*
|
|
501
|
+
- *"Search the React docs for information about useEffect cleanup"*
|
|
502
|
+
|
|
503
|
+
The agent calls the MCP tools automatically.
|
|
504
|
+
|
|
505
|
+
## Troubleshooting
|
|
506
|
+
|
|
507
|
+
### "Executable doesn't exist" / Playwright errors
|
|
508
|
+
|
|
509
|
+
Playwright is only needed for `{ javascript: true }` rendering and for screenshots. Most sites work without it.
|
|
510
|
+
|
|
511
|
+
```bash
|
|
512
|
+
npm install playwright && npx playwright install chromium
|
|
513
|
+
```
|
|
514
|
+
|
|
515
|
+
### "fetch failed" / SSL certificate errors
|
|
516
|
+
|
|
517
|
+
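Common in corporate environments (e.g. behind TLS-intercepting proxies). The workaround below disables TLS certificate verification for the entire Node.js process, so use it only for local debugging; where possible, point `NODE_EXTRA_CA_CERTS` at your corporate CA bundle instead: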
|
|
518
|
+
|
|
519
|
+
```bash
|
|
520
|
+
# Windows
|
|
521
|
+
set NODE_TLS_REJECT_UNAUTHORIZED=0
|
|
522
|
+
|
|
523
|
+
# Mac/Linux
|
|
524
|
+
export NODE_TLS_REJECT_UNAUTHORIZED=0
|
|
525
|
+
```
|
|
526
|
+
|
|
527
|
+
### Empty extraction / "No pages extracted"
|
|
528
|
+
|
|
529
|
+
1. **SPA sites** (React/Vue/Angular) need `{ javascript: true }` + Playwright
|
|
530
|
+
2. **Landing pages** have little content — target specific doc pages
|
|
531
|
+
3. **Blocked by WAF** — try with custom headers
|
|
532
|
+
|
|
533
|
+
### "pdf-parse is required"
|
|
534
|
+
|
|
535
|
+
```bash
|
|
536
|
+
npm install pdf-parse
|
|
537
|
+
```
|
|
538
|
+
|
|
539
|
+
## Architecture
|
|
540
|
+
|
|
541
|
+
```
|
|
542
|
+
URL → Sitemap Discovery → URL Queue
|
|
543
|
+
↓
|
|
544
|
+
[PDF?] → PDF Extractor
|
|
545
|
+
[GitHub?] → GitHub Extractor
|
|
546
|
+
[Web?] → Browser Manager (fetch/Playwright)
|
|
547
|
+
↓
|
|
548
|
+
Content Extractor (Cheerio + heuristics)
|
|
549
|
+
↓
|
|
550
|
+
Markdown Transformer (Turndown)
|
|
551
|
+
↓
|
|
552
|
+
Deduplication Check
|
|
553
|
+
↓
|
|
554
|
+
Content Chunker (tiktoken, 4 strategies)
|
|
555
|
+
↓
|
|
556
|
+
┌─────────────────────────────────────┐
|
|
557
|
+
│ Vector Search │ Vector DB Export │
|
|
558
|
+
│ Streaming │ Output Templates │
|
|
559
|
+
│ Cache + Diff │ Webhooks │
|
|
560
|
+
└─────────────────────────────────────┘
|
|
561
|
+
↓
|
|
562
|
+
CLI │ REST API │ SDK │ MCP Server │ LangChain
|
|
563
|
+
```
|
|
564
|
+
|
|
565
|
+
## Tech Stack
|
|
566
|
+
|
|
567
|
+
| Component | Technology |
|
|
568
|
+
|-----------|-----------|
|
|
569
|
+
| Browser rendering | Playwright (optional, lazy-loaded) |
|
|
570
|
+
| HTML parsing | Cheerio |
|
|
571
|
+
| Markdown conversion | Turndown (custom rules) |
|
|
572
|
+
| Token counting | tiktoken (cl100k_base) |
|
|
573
|
+
| Vector search | TF-IDF with cosine similarity |
|
|
574
|
+
| PDF parsing | pdf-parse (optional) |
|
|
575
|
+
| HTTP server | Express |
|
|
576
|
+
| CLI | Commander |
|
|
577
|
+
| Caching | LRU-Cache + File-based |
|
|
578
|
+
| Validation | Zod |
|
|
579
|
+
| Rate limiting | Token bucket algorithm |
|
|
580
|
+
|
|
581
|
+
## License
|
|
582
|
+
|
|
583
|
+
MIT
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/// <reference types="node" />
|
|
2
|
+
import { BrowserConfig, RetryConfig, RateLimitConfig } from '../core/types';
|
|
3
|
+
/**
|
|
4
|
+
* Browser manager using Playwright for JS-heavy page rendering.
|
|
5
|
+
* Handles rate limiting, retry with backoff, and robots.txt compliance.
|
|
6
|
+
*/
|
|
7
|
+
export declare class BrowserManager {
|
|
8
|
+
private browser;
|
|
9
|
+
private context;
|
|
10
|
+
private config;
|
|
11
|
+
private rateLimitConfig;
|
|
12
|
+
private robotsCache;
|
|
13
|
+
private tokens;
|
|
14
|
+
private lastRefill;
|
|
15
|
+
constructor(config?: BrowserConfig, rateLimitConfig?: RateLimitConfig);
|
|
16
|
+
private refillTokens;
|
|
17
|
+
private waitForToken;
|
|
18
|
+
launch(): Promise<void>;
|
|
19
|
+
checkRobots(url: string): Promise<boolean>;
|
|
20
|
+
fetchWithRetry<T>(fn: () => Promise<T>, retryConfig?: RetryConfig): Promise<T>;
|
|
21
|
+
fetchPage(url: string, options?: {
|
|
22
|
+
respectRobots?: boolean;
|
|
23
|
+
waitForSelector?: string;
|
|
24
|
+
timeout?: number;
|
|
25
|
+
cookies?: Array<{
|
|
26
|
+
name: string;
|
|
27
|
+
value: string;
|
|
28
|
+
domain: string;
|
|
29
|
+
path?: string;
|
|
30
|
+
}>;
|
|
31
|
+
headers?: Record<string, string>;
|
|
32
|
+
retryConfig?: RetryConfig;
|
|
33
|
+
}): Promise<{
|
|
34
|
+
content: string;
|
|
35
|
+
status: number;
|
|
36
|
+
}>;
|
|
37
|
+
fetchStatic(url: string, options?: {
|
|
38
|
+
respectRobots?: boolean;
|
|
39
|
+
headers?: Record<string, string>;
|
|
40
|
+
retryConfig?: RetryConfig;
|
|
41
|
+
}): Promise<{
|
|
42
|
+
body: Buffer;
|
|
43
|
+
status: number;
|
|
44
|
+
}>;
|
|
45
|
+
close(): Promise<void>;
|
|
46
|
+
}
|
|
47
|
+
//# sourceMappingURL=manager.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"manager.d.ts","sourceRoot":"","sources":["../../src/browser/manager.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,aAAa,EAAE,WAAW,EAAE,eAAe,EAAE,MAAM,eAAe,CAAC;AAK5E;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,OAAO,CAAa;IAC5B,OAAO,CAAC,OAAO,CAAa;IAC5B,OAAO,CAAC,MAAM,CAAgB;IAC9B,OAAO,CAAC,eAAe,CAAkB;IACzC,OAAO,CAAC,WAAW,CAA+B;IAClD,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,UAAU,CAAS;gBAEf,MAAM,GAAE,aAAkB,EAAE,eAAe,CAAC,EAAE,eAAe;IAYzE,OAAO,CAAC,YAAY;YAON,YAAY;IAYpB,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC;IAuBvB,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAkB1C,cAAc,CAAC,CAAC,EAAE,EAAE,EAAE,MAAM,OAAO,CAAC,CAAC,CAAC,EAAE,WAAW,GAAE,WAA2B,GAAG,OAAO,CAAC,CAAC,CAAC;IAe7F,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE;QACpC,aAAa,CAAC,EAAE,OAAO,CAAC;QACxB,eAAe,CAAC,EAAE,MAAM,CAAC;QACzB,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,OAAO,CAAC,EAAE,KAAK,CAAC;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,KAAK,EAAE,MAAM,CAAC;YAAC,MAAM,EAAE,MAAM,CAAC;YAAC,IAAI,CAAC,EAAE,MAAM,CAAA;SAAE,CAAC,CAAC;QAChF,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACjC,WAAW,CAAC,EAAE,WAAW,CAAC;KACtB,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IAyD/C,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE;QACtC,aAAa,CAAC,EAAE,OAAO,CAAC;QACxB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACjC,WAAW,CAAC,EAAE,WAAW,CAAC;KACtB,GAAG,OAAO,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IA2B5C,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAW7B"}
|